Data Exploration

# Load necessary libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(skimr)
library(rstudioapi)
library(inspectdf)
library(mice)
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.2
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(recipes)
## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(purrr)
library(graphics) 
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from 'package:plotly':
## 
##     subplot
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(glue)
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## 
## Attaching package: 'h2o'
## 
## The following objects are masked from 'package:data.table':
## 
##     hour, month, week, year
## 
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## 
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## 
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# Load dataset (mpg from ggplot2)
# NOTE(review): the object name `data` is also the name of utils::data();
# function calls like data(...) still resolve to the function, but the name is
# easy to confuse.
data <- ggplot2::mpg

# Explore data using skimr and inspect_na
# skim() prints a per-column summary (missing counts, completeness, and
# type-specific statistics) for the whole data set.
data %>% skim()
Data summary
Name Piped data
Number of rows 234
Number of columns 11
_______________________
Column type frequency:
character 6
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
manufacturer 0 1 4 10 0 15 0
model 0 1 2 22 0 38 0
trans 0 1 8 10 0 10 0
drv 0 1 1 1 0 3 0
fl 0 1 1 1 0 5 0
class 0 1 3 10 0 7 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
displ 0 1 3.47 1.29 1.6 2.4 3.3 4.6 7 ▇▆▆▃▁
year 0 1 2003.50 4.51 1999.0 1999.0 2003.5 2008.0 2008 ▇▁▁▁▇
cyl 0 1 5.89 1.61 4.0 4.0 6.0 8.0 8 ▇▁▇▁▇
cty 0 1 16.86 4.26 9.0 14.0 17.0 19.0 35 ▆▇▃▁▁
hwy 0 1 23.44 5.95 12.0 18.0 24.0 27.0 44 ▅▅▇▁▁
# Tabulate the count and percentage of missing values for every column.
data %>% inspect_na()
## # A tibble: 11 × 3
##    col_name       cnt  pcnt
##    <chr>        <int> <dbl>
##  1 manufacturer     0     0
##  2 model            0     0
##  3 displ            0     0
##  4 year             0     0
##  5 cyl              0     0
##  6 trans            0     0
##  7 drv              0     0
##  8 cty              0     0
##  9 hwy              0     0
## 10 fl               0     0
## 11 class            0     0
# Identify numeric variables
# Start by listing every column name for reference.
names(data)
##  [1] "manufacturer" "model"        "displ"        "year"         "cyl"         
##  [6] "trans"        "drv"          "cty"          "hwy"          "fl"          
## [11] "class"
# Collect the names of all numeric columns of `data`.
# select(where(...)) is the current replacement for the superseded select_if().
num_vars <- data %>%
  select(where(is.numeric)) %>%
  names()
# Print the selected names.
num_vars
## [1] "displ" "year"  "cyl"   "cty"   "hwy"
# Report boxplot outliers for every numeric variable.
# boxplot() draws one plot per column and returns the points lying beyond the
# whiskers in `$out`; columns without such points print nothing.
for (col_name in num_vars) {
  OutVals <- boxplot(data[[col_name]])$out
  if (length(OutVals) == 0) {
    next
  }
  print(sprintf("----%s", col_name))
  print(OutVals)
}

## [1] "----cty"
## [1] 28 28 33 35 29

## [1] "----hwy"
## [1] 44 44 41
# Replace outliers in 'cty' variable with quartile values:
# outliers above the median are set to the 75th percentile and outliers below
# the median are set to the 25th percentile.

# Drop incomplete rows first, so that the outlier detection, the median and
# the quartiles are all computed on the same NA-free data (a median taken on
# data containing NA would itself be NA and no value would be replaced).
data <- na.omit(data)

# boxplot() draws the plot and returns the points beyond the whiskers in `$out`.
OutVals <- boxplot(data[["cty"]])$out

# Stored as `cty_median` rather than `median` so the median() function is not
# shadowed by a variable of the same name.
cty_median <- median(data[["cty"]])

# Split the outliers into the high (o3) and low (o1) side of the median.
o3 <- OutVals[OutVals > cty_median]
o1 <- OutVals[OutVals < cty_median]

# names = FALSE drops the "75%" / "25%" labels so plain numbers are assigned.
val75 <- quantile(data[["cty"]], 0.75, names = FALSE)
val25 <- quantile(data[["cty"]], 0.25, names = FALSE)

# Assign through the column vector rather than tibble cell assignment: this
# also works when a quartile is fractional. If `cty` is stored as integer it
# becomes double as a result.
data[["cty"]][data[["cty"]] %in% o3] <- val75
data[["cty"]][data[["cty"]] %in% o1] <- val25

# Re-draw the boxplot to check the result of the replacement.
boxplot(data[["cty"]])

# Prepare data for modeling
# Print the available column names before choosing the target and predictors.
names(data)
##  [1] "manufacturer" "model"        "displ"        "year"         "cyl"         
##  [6] "trans"        "drv"          "cty"          "hwy"          "fl"          
## [11] "class"
# Name of the response column.
target <- "cty"
# Predictor columns, kept as a three-column tibble (the columns themselves,
# not just their names).
features <- data[c("year", "cyl", "displ")]

Modeling

# Build a linear regression model.
# Note: this calls stats::glm() with its default (gaussian) family; H2O is not
# involved in this step.
#
# The formula must be built from the predictor *names*. `features` may hold
# the predictor columns themselves (a data frame) or just their names, so the
# names are extracted first. Pasting a data frame directly would deparse every
# column's values into the formula and produce unreadable coefficient labels
# such as `c(1999, 1999, 2008, ...)`.
feature_names <- if (is.data.frame(features)) {
  names(features)
} else {
  as.character(features)
}
f <- reformulate(feature_names, response = target)

# NOTE(review): the fitted model is stored under the name `glm`, which is also
# the name of the fitting function. The name is kept in case later code refers
# to it; calls to glm(...) still resolve to the function.
glm <- glm(f, data = data)
glm %>% summary()
## 
## Call:
## glm(formula = f, data = data)
## 
## Coefficients:
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             Estimate
## (Intercept)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               -172.06798
## c(1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 2008, 2008, 2008, 2008, 2008, 1999, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 2008, 1999, 2008, 1999, 2008, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 1999, \n    1999, 1999, 1999, 2008, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, \n    1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008)    0.09952
## c(4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 8, 8, 6, 6, 8, 8, 8, 8, 8, 6, 6, 6, 6, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 8, 8, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 8, 4, 4, 4, 4, 4, 4, 4, 4, \n    4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 8, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 4, 8, 8, 4, 4, 4, 6, 6, 6, 6, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 5, 5, 6, 6, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 6, 6, 6)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       -1.20798
## c(1.8, 1.8, 2, 2, 2.8, 2.8, 3.1, 1.8, 1.8, 2, 2, 2.8, 2.8, 3.1, 3.1, 2.8, 3.1, 4.2, 5.3, 5.3, 5.3, 5.7, 6, 5.7, 5.7, 6.2, 6.2, 7, 5.3, 5.3, 5.7, 6.5, 2.4, 2.4, 3.1, 3.5, 3.6, 2.4, 3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.8, 3.8, 3.8, 4, 3.7, 3.7, 3.9, 3.9, 4.7, 4.7, 4.7, 5.2, 5.2, 3.9, 4.7, 4.7, 4.7, 5.2, 5.7, 5.9, 4.7, 4.7, 4.7, 4.7, 4.7, 4.7, 5.2, 5.2, 5.7, 5.9, 4.6, 5.4, 5.4, 4, 4, 4, 4, 4.6, 5, 4.2, 4.2, 4.6, 4.6, 4.6, 5.4, 5.4, 3.8, 3.8, 4, 4, 4.6, 4.6, 4.6, 4.6, 5.4, 1.6, 1.6, 1.6, 1.6, 1.6, 1.8, 1.8, \n    1.8, 2, 2.4, 2.4, 2.4, 2.4, 2.5, 2.5, 3.3, 2, 2, 2, 2, 2.7, 2.7, 2.7, 3, 3.7, 4, 4.7, 4.7, 4.7, 5.7, 6.1, 4, 4.2, 4.4, 4.6, 5.4, 5.4, 5.4, 4, 4, 4.6, 5, 2.4, 2.4, 2.5, 2.5, 3.5, 3.5, 3, 3, 3.5, 3.3, 3.3, 4, 5.6, 3.1, 3.8, 3.8, 3.8, 5.3, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.2, 2.2, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.7, 2.7, 3.4, 3.4, 4, 4.7, 2.2, 2.2, 2.4, 2.4, 3, 3, 3.5, 2.2, 2.2, 2.4, 2.4, 3, 3, 3.3, 1.8, 1.8, 1.8, 1.8, 1.8, 4.7, 5.7, 2.7, 2.7, 2.7, 3.4, 3.4, 4, 4, 2, 2, 2, 2, 2.8, 1.9, 2, 2, 2, 2, \n    2.5, 2.5, 2.8, 2.8, 1.9, 1.9, 2, 2, 2.5, 2.5, 1.8, 1.8, 2, 2, 2.8, 2.8, 3.6)                                                                                                                                                                                                                                                                                                                                             -1.03653
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           Std. Error
## (Intercept)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 56.82357
## c(1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 2008, 2008, 2008, 2008, 2008, 1999, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 2008, 1999, 2008, 1999, 2008, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 1999, \n    1999, 1999, 1999, 2008, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, \n    1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008)    0.02838
## c(4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 8, 8, 6, 6, 8, 8, 8, 8, 8, 6, 6, 6, 6, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 8, 8, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 8, 4, 4, 4, 4, 4, 4, 4, 4, \n    4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 8, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 4, 8, 8, 4, 4, 4, 6, 6, 6, 6, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 5, 5, 6, 6, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 6, 6, 6)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        0.21401
## c(1.8, 1.8, 2, 2, 2.8, 2.8, 3.1, 1.8, 1.8, 2, 2, 2.8, 2.8, 3.1, 3.1, 2.8, 3.1, 4.2, 5.3, 5.3, 5.3, 5.7, 6, 5.7, 5.7, 6.2, 6.2, 7, 5.3, 5.3, 5.7, 6.5, 2.4, 2.4, 3.1, 3.5, 3.6, 2.4, 3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.8, 3.8, 3.8, 4, 3.7, 3.7, 3.9, 3.9, 4.7, 4.7, 4.7, 5.2, 5.2, 3.9, 4.7, 4.7, 4.7, 5.2, 5.7, 5.9, 4.7, 4.7, 4.7, 4.7, 4.7, 4.7, 5.2, 5.2, 5.7, 5.9, 4.6, 5.4, 5.4, 4, 4, 4, 4, 4.6, 5, 4.2, 4.2, 4.6, 4.6, 4.6, 5.4, 5.4, 3.8, 3.8, 4, 4, 4.6, 4.6, 4.6, 4.6, 5.4, 1.6, 1.6, 1.6, 1.6, 1.6, 1.8, 1.8, \n    1.8, 2, 2.4, 2.4, 2.4, 2.4, 2.5, 2.5, 3.3, 2, 2, 2, 2, 2.7, 2.7, 2.7, 3, 3.7, 4, 4.7, 4.7, 4.7, 5.7, 6.1, 4, 4.2, 4.4, 4.6, 5.4, 5.4, 5.4, 4, 4, 4.6, 5, 2.4, 2.4, 2.5, 2.5, 3.5, 3.5, 3, 3, 3.5, 3.3, 3.3, 4, 5.6, 3.1, 3.8, 3.8, 3.8, 5.3, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.2, 2.2, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.7, 2.7, 3.4, 3.4, 4, 4.7, 2.2, 2.2, 2.4, 2.4, 3, 3, 3.5, 2.2, 2.2, 2.4, 2.4, 3, 3, 3.3, 1.8, 1.8, 1.8, 1.8, 1.8, 4.7, 5.7, 2.7, 2.7, 2.7, 3.4, 3.4, 4, 4, 2, 2, 2, 2, 2.8, 1.9, 2, 2, 2, 2, \n    2.5, 2.5, 2.8, 2.8, 1.9, 1.9, 2, 2, 2.5, 2.5, 1.8, 1.8, 2, 2, 2.8, 2.8, 3.6)                                                                                                                                                                                                                                                                                                                                              0.26789
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           t value
## (Intercept)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                -3.028
## c(1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 2008, 2008, 2008, 2008, 2008, 1999, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 2008, 1999, 2008, 1999, 2008, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 1999, \n    1999, 1999, 1999, 2008, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, \n    1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008)   3.507
## c(4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 8, 8, 6, 6, 8, 8, 8, 8, 8, 6, 6, 6, 6, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 8, 8, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 8, 4, 4, 4, 4, 4, 4, 4, 4, \n    4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 8, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 4, 8, 8, 4, 4, 4, 6, 6, 6, 6, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 5, 5, 6, 6, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 6, 6, 6)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      -5.644
## c(1.8, 1.8, 2, 2, 2.8, 2.8, 3.1, 1.8, 1.8, 2, 2, 2.8, 2.8, 3.1, 3.1, 2.8, 3.1, 4.2, 5.3, 5.3, 5.3, 5.7, 6, 5.7, 5.7, 6.2, 6.2, 7, 5.3, 5.3, 5.7, 6.5, 2.4, 2.4, 3.1, 3.5, 3.6, 2.4, 3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.8, 3.8, 3.8, 4, 3.7, 3.7, 3.9, 3.9, 4.7, 4.7, 4.7, 5.2, 5.2, 3.9, 4.7, 4.7, 4.7, 5.2, 5.7, 5.9, 4.7, 4.7, 4.7, 4.7, 4.7, 4.7, 5.2, 5.2, 5.7, 5.9, 4.6, 5.4, 5.4, 4, 4, 4, 4, 4.6, 5, 4.2, 4.2, 4.6, 4.6, 4.6, 5.4, 5.4, 3.8, 3.8, 4, 4, 4.6, 4.6, 4.6, 4.6, 5.4, 1.6, 1.6, 1.6, 1.6, 1.6, 1.8, 1.8, \n    1.8, 2, 2.4, 2.4, 2.4, 2.4, 2.5, 2.5, 3.3, 2, 2, 2, 2, 2.7, 2.7, 2.7, 3, 3.7, 4, 4.7, 4.7, 4.7, 5.7, 6.1, 4, 4.2, 4.4, 4.6, 5.4, 5.4, 5.4, 4, 4, 4.6, 5, 2.4, 2.4, 2.5, 2.5, 3.5, 3.5, 3, 3, 3.5, 3.3, 3.3, 4, 5.6, 3.1, 3.8, 3.8, 3.8, 5.3, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.2, 2.2, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.7, 2.7, 3.4, 3.4, 4, 4.7, 2.2, 2.2, 2.4, 2.4, 3, 3, 3.5, 2.2, 2.2, 2.4, 2.4, 3, 3, 3.3, 1.8, 1.8, 1.8, 1.8, 1.8, 4.7, 5.7, 2.7, 2.7, 2.7, 3.4, 3.4, 4, 4, 2, 2, 2, 2, 2.8, 1.9, 2, 2, 2, 2, \n    2.5, 2.5, 2.8, 2.8, 1.9, 1.9, 2, 2, 2.5, 2.5, 1.8, 1.8, 2, 2, 2.8, 2.8, 3.6)                                                                                                                                                                                                                                                                                                                                            -3.869
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           Pr(>|t|)
## (Intercept)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               0.002742
## c(1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 2008, 2008, 2008, 2008, 2008, 1999, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 2008, 1999, 2008, 1999, 2008, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 1999, \n    1999, 1999, 1999, 2008, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, \n    1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008) 0.000545
## c(4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 8, 8, 6, 6, 8, 8, 8, 8, 8, 6, 6, 6, 6, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 8, 8, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 8, 4, 4, 4, 4, 4, 4, 4, 4, \n    4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 8, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 4, 8, 8, 4, 4, 4, 6, 6, 6, 6, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 5, 5, 6, 6, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 6, 6, 6)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     4.85e-08
## c(1.8, 1.8, 2, 2, 2.8, 2.8, 3.1, 1.8, 1.8, 2, 2, 2.8, 2.8, 3.1, 3.1, 2.8, 3.1, 4.2, 5.3, 5.3, 5.3, 5.7, 6, 5.7, 5.7, 6.2, 6.2, 7, 5.3, 5.3, 5.7, 6.5, 2.4, 2.4, 3.1, 3.5, 3.6, 2.4, 3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.8, 3.8, 3.8, 4, 3.7, 3.7, 3.9, 3.9, 4.7, 4.7, 4.7, 5.2, 5.2, 3.9, 4.7, 4.7, 4.7, 5.2, 5.7, 5.9, 4.7, 4.7, 4.7, 4.7, 4.7, 4.7, 5.2, 5.2, 5.7, 5.9, 4.6, 5.4, 5.4, 4, 4, 4, 4, 4.6, 5, 4.2, 4.2, 4.6, 4.6, 4.6, 5.4, 5.4, 3.8, 3.8, 4, 4, 4.6, 4.6, 4.6, 4.6, 5.4, 1.6, 1.6, 1.6, 1.6, 1.6, 1.8, 1.8, \n    1.8, 2, 2.4, 2.4, 2.4, 2.4, 2.5, 2.5, 3.3, 2, 2, 2, 2, 2.7, 2.7, 2.7, 3, 3.7, 4, 4.7, 4.7, 4.7, 5.7, 6.1, 4, 4.2, 4.4, 4.6, 5.4, 5.4, 5.4, 4, 4, 4.6, 5, 2.4, 2.4, 2.5, 2.5, 3.5, 3.5, 3, 3, 3.5, 3.3, 3.3, 4, 5.6, 3.1, 3.8, 3.8, 3.8, 5.3, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.2, 2.2, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.7, 2.7, 3.4, 3.4, 4, 4.7, 2.2, 2.2, 2.4, 2.4, 3, 3, 3.5, 2.2, 2.2, 2.4, 2.4, 3, 3, 3.3, 1.8, 1.8, 1.8, 1.8, 1.8, 4.7, 5.7, 2.7, 2.7, 2.7, 3.4, 3.4, 4, 4, 2, 2, 2, 2, 2.8, 1.9, 2, 2, 2, 2, \n    2.5, 2.5, 2.8, 2.8, 1.9, 1.9, 2, 2, 2.5, 2.5, 1.8, 1.8, 2, 2, 2.8, 2.8, 3.6)                                                                                                                                                                                                                                                                                                                                           0.000142
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
## (Intercept)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               ** 
## c(1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 2008, 2008, 2008, 2008, 2008, 1999, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 2008, 1999, 2008, 1999, 2008, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 1999, \n    1999, 1999, 1999, 2008, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, \n    1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 1999, 2008, 2008, 1999, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 1999, 1999, 2008, 2008, 2008, 2008, 1999, 1999, 1999, 1999, 1999, 1999, 2008, 2008, 1999, 1999, 2008, 2008, 1999, 1999, 2008) ***
## c(4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 8, 8, 6, 6, 8, 8, 8, 8, 8, 6, 6, 6, 6, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 8, 8, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 8, 4, 4, 4, 4, 4, 4, 4, 4, \n    4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 8, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 4, 8, 8, 4, 4, 4, 6, 6, 6, 6, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 5, 5, 6, 6, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 6, 6, 6)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     ***
## c(1.8, 1.8, 2, 2, 2.8, 2.8, 3.1, 1.8, 1.8, 2, 2, 2.8, 2.8, 3.1, 3.1, 2.8, 3.1, 4.2, 5.3, 5.3, 5.3, 5.7, 6, 5.7, 5.7, 6.2, 6.2, 7, 5.3, 5.3, 5.7, 6.5, 2.4, 2.4, 3.1, 3.5, 3.6, 2.4, 3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.8, 3.8, 3.8, 4, 3.7, 3.7, 3.9, 3.9, 4.7, 4.7, 4.7, 5.2, 5.2, 3.9, 4.7, 4.7, 4.7, 5.2, 5.7, 5.9, 4.7, 4.7, 4.7, 4.7, 4.7, 4.7, 5.2, 5.2, 5.7, 5.9, 4.6, 5.4, 5.4, 4, 4, 4, 4, 4.6, 5, 4.2, 4.2, 4.6, 4.6, 4.6, 5.4, 5.4, 3.8, 3.8, 4, 4, 4.6, 4.6, 4.6, 4.6, 5.4, 1.6, 1.6, 1.6, 1.6, 1.6, 1.8, 1.8, \n    1.8, 2, 2.4, 2.4, 2.4, 2.4, 2.5, 2.5, 3.3, 2, 2, 2, 2, 2.7, 2.7, 2.7, 3, 3.7, 4, 4.7, 4.7, 4.7, 5.7, 6.1, 4, 4.2, 4.4, 4.6, 5.4, 5.4, 5.4, 4, 4, 4.6, 5, 2.4, 2.4, 2.5, 2.5, 3.5, 3.5, 3, 3, 3.5, 3.3, 3.3, 4, 5.6, 3.1, 3.8, 3.8, 3.8, 5.3, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.2, 2.2, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.7, 2.7, 3.4, 3.4, 4, 4.7, 2.2, 2.2, 2.4, 2.4, 3, 3, 3.5, 2.2, 2.2, 2.4, 2.4, 3, 3, 3.3, 1.8, 1.8, 1.8, 1.8, 1.8, 4.7, 5.7, 2.7, 2.7, 2.7, 3.4, 3.4, 4, 4, 2, 2, 2, 2, 2.8, 1.9, 2, 2, 2, 2, \n    2.5, 2.5, 2.8, 2.8, 1.9, 1.9, 2, 2, 2.5, 2.5, 1.8, 1.8, 2, 2, 2.8, 2.8, 3.6)                                                                                                                                                                                                                                                                                                                                           ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 3.725912)
## 
##     Null deviance: 3243.61  on 233  degrees of freedom
## Residual deviance:  856.96  on 230  degrees of freedom
## AIC: 977.81
## 
## Number of Fisher Scoring iterations: 2
# Check and handle multicollinearity using VIF
library(faraway)
## 
## Attaching package: 'faraway'
## The following object is masked from 'package:lattice':
## 
##     melanoma
## The following object is masked from 'package:mice':
## 
##     mammalsleep
# Backward elimination on VIF: while the largest variance inflation factor of
# the current model is at or above the cutoff, drop that predictor and refit.
# The VIFs are computed once per fit and reused for both the stopping test and
# the choice of which predictor to remove.
# `glm` is both the fitted-model variable and the stats function; the call
# `glm(f, ...)` still resolves to the function because R skips non-function
# bindings when it looks up a name in call position.
vif_cutoff <- 1.5
vif_sorted <- sort(faraway::vif(glm), decreasing = TRUE)
# The length guard stops before the last remaining predictor would be removed
# (which would build a formula with an empty right-hand side) and also avoids
# an NA loop condition when no VIF values are available.
while (length(vif_sorted) > 1 && vif_sorted[1] >= vif_cutoff) {
  afterVIF <- names(vif_sorted)[-1]
  f <- as.formula(paste(target, paste(afterVIF, collapse = " + "), sep = " ~ "))
  glm <- glm(f, data = data)
  vif_sorted <- sort(faraway::vif(glm), decreasing = TRUE)
}

# Store the names of the predictors left in the model after VIF screening,
# ordered from highest to lowest VIF. Nothing is printed here.
# NOTE(review): `features` is reassigned to a hard-coded vector before the H2O
# model is fitted, so this selection does not reach that model.
features <- names(sort(faraway::vif(glm), decreasing = TRUE))

# Keep only the response (cty) and the three candidate predictors, then print
# a compact overview of the reduced table.
data <- select(data, cty, year, cyl, displ)
glimpse(data)
## Rows: 234
## Columns: 4
## $ cty   <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 15, 15, …
## $ year  <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008…
## $ cyl   <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8…
## $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.8, 2.8,…
# Centre and scale every column except the first (the response, cty) so the
# predictors are on a common scale.
# NOTE(review): the scaling statistics come from the full data set, i.e. before
# the train/test split further down -- confirm that is intended.
data[, -1] <- as.data.frame(scale(data[, -1]))

# Start (or connect to) the local H2O cluster
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         41 minutes 11 seconds 
##     H2O cluster timezone:       Asia/Baku 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.42.0.2 
##     H2O cluster version age:    5 months and 3 days 
##     H2O cluster name:           H2O_started_from_R_ACER_osn291 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.77 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.1 (2023-06-16 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (5 months and 3 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Upload the prepared data frame to the H2O cluster
h2o_data <- as.h2o(data)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Response and predictor column names used by the H2O model
target <- "cty"
features <- names(select(data, c("year", "cyl", "displ")))

# 80/20 partition of the H2O frame with a fixed seed; `h2o_data` is rebound to
# the list holding the two resulting frames.
h2o_data <- h2o.splitFrame(h2o_data, ratios = 0.8, seed = 123)
train <- h2o_data[[1]]
test <- h2o_data[[2]]

# Unregularised GLM (lambda = 0) with 10-fold cross-validation and coefficient
# p-values. The test frame is also passed as the validation frame.
model <- h2o.glm(
  x = features,
  y = target,
  training_frame = train,
  validation_frame = test,
  seed = 123,
  nfolds = 10,
  lambda = 0,
  compute_p_values = TRUE
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Print the full H2O report for the fitted GLM: training, validation and
# cross-validation metrics, scoring history and variable importances
summary(model)
## Model Details:
## ==============
## 
## H2ORegressionModel: glm
## Model Key:  GLM_model_R_1703759888795_16 
## GLM Model: summary
##     family     link regularization number_of_predictors_total
## 1 gaussian identity           None                          3
##   number_of_active_predictors number_of_iterations  training_frame
## 1                           3                    1 RTMP_sid_82fc_3
## 
## H2ORegressionMetrics: glm
## ** Reported on training data. **
## 
## MSE:  3.43623
## RMSE:  1.853707
## MAE:  1.428847
## RMSLE:  0.1110743
## Mean Residual Deviance :  3.43623
## R^2 :  0.7391495
## Null Deviance :2423.864
## Null D.o.F. :183
## Residual Deviance :632.2662
## Residual D.o.F. :180
## AIC :759.2943
## 
## 
## H2ORegressionMetrics: glm
## ** Reported on validation data. **
## 
## MSE:  4.630496
## RMSE:  2.151859
## MAE:  1.678881
## RMSLE:  0.1379459
## Mean Residual Deviance :  4.630496
## R^2 :  0.7154596
## Null Deviance :821.3956
## Null D.o.F. :49
## Residual Deviance :231.5248
## Residual D.o.F. :46
## AIC :228.5271
## 
## 
## H2ORegressionMetrics: glm
## ** Reported on cross-validation data. **
## ** 10-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  3.639883
## RMSE:  1.907848
## MAE:  1.463574
## RMSLE:  0.114575
## Mean Residual Deviance :  3.639883
## R^2 :  0.7236898
## Null Deviance :2476.279
## Null D.o.F. :183
## Residual Deviance :669.7385
## Residual D.o.F. :180
## AIC :769.8885
## 
## 
## Cross-Validation Metrics Summary: 
##                              mean        sd cv_1_valid cv_2_valid cv_3_valid
## mae                      1.447927  0.303751   1.552973   1.559963   1.142328
## mean_residual_deviance   3.530423  1.323507   3.621001   4.204665   2.452665
## mse                      3.530423  1.323507   3.621001   4.204665   2.452665
## null_deviance          247.627900 96.894516 239.961460 306.785000 241.668290
## r2                       0.674627  0.161133   0.758537   0.634322   0.818344
## residual_deviance       66.973850 32.842900  57.936016 100.911970  41.695300
## rmse                     1.845679  0.371024   1.902893   2.050528   1.566099
## rmsle                    0.108994  0.023277   0.101542   0.144045   0.077322
##                        cv_4_valid cv_5_valid cv_6_valid cv_7_valid cv_8_valid
## mae                      0.880118   1.857747   1.815711   1.306672   1.337201
## mean_residual_deviance   1.180759   6.065352   4.356180   2.728392   3.317573
## mse                      1.180759   6.065352   4.356180   2.728392   3.317573
## null_deviance          241.559570 411.962200 136.014180  73.008820 309.713260
## r2                       0.861630   0.607683   0.487508   0.341815   0.718680
## residual_deviance       17.711380 127.372380  69.698880  46.382668  86.256905
## rmse                     1.086627   2.462793   2.087146   1.651784   1.821421
## rmsle                    0.077603   0.137223   0.123147   0.104629   0.123642
##                        cv_9_valid cv_10_valid
## mae                      1.666621    1.359937
## mean_residual_deviance   4.310615    3.067023
## mse                      4.310615    3.067023
## null_deviance          316.324000  199.282270
## r2                       0.720221    0.797529
## residual_deviance       81.901700   39.871300
## rmse                     2.076202    1.751292
## rmsle                    0.110992    0.089790
## 
## Scoring History: 
##             timestamp   duration iterations negative_log_likelihood objective
## 1 2023-12-28 15:19:29  0.000 sec          0              2423.86413  13.17317
## 2 2023-12-28 15:19:29  0.003 sec          1                      NA        NA
##   training_rmse training_deviance training_mae training_r2 validation_rmse
## 1            NA                NA           NA          NA              NA
## 2       1.85371           3.43623      1.42885     0.73915         2.15186
##   validation_deviance validation_mae validation_r2
## 1                  NA             NA            NA
## 2             4.63050        1.67888       0.71546
## 
## Variable Importances: (Extract with `h2o.varimp`) 
## =================================================
## 
## Variable Importances: 
##   variable relative_importance scaled_importance percentage
## 1    displ            1.703743          1.000000   0.462959
## 2      cyl            1.509642          0.886074   0.410216
## 3     year            0.466729          0.273943   0.126825
# Show the p-value of each predictor, rounded to three decimals and sorted
# from largest to smallest. The first row of the coefficients table is dropped
# by position (presumably the intercept -- verify).
coef_pvalues <- as.data.frame(model@model$coefficients_table)
coef_pvalues <- dplyr::select(coef_pvalues, names, p_value)
coef_pvalues <- mutate(coef_pvalues, p_value = round(p_value, 3))
coef_pvalues <- coef_pvalues[-1, ]
arrange(coef_pvalues, desc(p_value))
##   names p_value
## 1  year   0.001
## 2   cyl   0.000
## 3 displ   0.000

Evaluation

# Score the held-out test frame and pull the predictions into an R data frame
y_pred <- as.data.frame(h2o.predict(model, newdata = test))
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Print the predicted cty values for the test rows
y_pred$predict
##  [1] 20.22789 20.89541 17.53890 10.48787 19.43621 16.34359 17.27501 16.74722
##  [9] 13.52266 13.52266 13.52266 11.93151 13.52266 13.52266 13.52266 16.35138
## [17] 12.72319 13.65460 21.15930 21.15930 20.89541 20.36762 20.89541 18.06669
## [25] 18.06669 11.67539 13.51488 11.66761 20.23567 16.35138 19.30426 19.30426
## [33] 20.23567 19.70010 20.23567 19.70010 19.70010 20.36762 16.73944 17.01111
## [41] 19.70010 16.73944 16.73944 12.59124 16.21165 19.96400 19.96400 19.96400
## [49] 20.22789 20.89541
# Test-set error: residuals (observed minus predicted) and their RMSE
test_set <- as.data.frame(test)
residuals <- test_set$cty - y_pred$predict
RMSE <- sqrt(mean(residuals^2))

# R-squared from the residual and total sums of squares, then the adjustment
# for the number of predictors (k) relative to the number of test rows (n)
n <- nrow(test_set)
k <- length(features)
y_test_mean <- mean(test_set$cty)
rss <- sum(residuals^2)
tss <- sum((test_set$cty - y_test_mean)^2)
R2 <- 1 - (rss / tss)
Adjusted_R2 <- 1 - (1 - R2) * ((n - 1) / (n - k - 1))

# One-row summary of the test metrics (only RMSE is rounded)
tibble(
  RMSE = round(RMSE, 1),
  R2 = R2,
  Adjusted_R2 = Adjusted_R2
)
## # A tibble: 1 × 3
##    RMSE    R2 Adjusted_R2
##   <dbl> <dbl>       <dbl>
## 1   2.2 0.715       0.697
# Pair each test-set prediction with its observed value. Built directly as a
# data frame instead of going through a cbind() matrix.
my_data <- data.frame(
  predicted = y_pred$predict,
  observed = test_set$cty
)

# Observed vs. predicted scatter plot with a fitted linear trend line.
# The axes are labelled with the modelled response (cty) rather than the
# unrelated "Power Output" wording, and the adjusted R2 value is interpolated
# into the title directly (no enexpr() needed for a plain numeric variable).
g <- ggplot(my_data, aes(predicted, observed)) +
  geom_point(color = "red") +
  geom_smooth(method = lm) +
  labs(
    x = "Predicted cty",
    y = "Observed cty",
    title = glue("Test: Adjusted R2 = {round(Adjusted_R2, 2)}")
  ) +
  theme(
    plot.title = element_text(color = "darkgreen", size = 16, hjust = 0.5),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

# Render the static plot as an interactive plotly widget
ggplotly(g)
## `geom_smooth()` using formula = 'y ~ x'
# Score the training frame and pull the predictions into an R data frame
y_pred_train <- as.data.frame(h2o.predict(model, newdata = train))
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Training-set error: residuals (observed minus predicted) and their RMSE
train_set <- as.data.frame(train)
residuals_train <- train_set$cty - y_pred_train$predict
RMSE_train <- sqrt(mean(residuals_train^2))

# R-squared from the residual and total sums of squares, then the adjustment
# for the number of predictors relative to the number of training rows
n_train <- nrow(train_set)
k_train <- length(features)
y_train_mean <- mean(train_set$cty)
rss_train <- sum(residuals_train^2)
tss_train <- sum((train_set$cty - y_train_mean)^2)
R2_train <- 1 - (rss_train / tss_train)
Adjusted_R2_train <- 1 - (1 - R2_train) * ((n_train - 1) / (n_train - k_train - 1))

# Pair each training-set prediction with its observed value. Built directly as
# a data frame instead of going through a cbind() matrix.
my_data_train <- data.frame(
  predicted = y_pred_train$predict,
  observed = train_set$cty
)

# Observed vs. predicted scatter plot for the training rows with a fitted
# linear trend line. The axes are labelled with the modelled response (cty)
# rather than the unrelated "Power Output" wording, and the adjusted R2 value
# is interpolated into the title directly (no enexpr() needed).
g_train <- ggplot(my_data_train, aes(predicted, observed)) +
  geom_point(color = "darkred") +
  geom_smooth(method = lm) +
  labs(
    x = "Predicted cty",
    y = "Observed cty",
    title = glue("Train: Adjusted R2 = {round(Adjusted_R2_train, 2)}")
  ) +
  theme(
    plot.title = element_text(color = "darkgreen", size = 16, hjust = 0.5),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

# Render the static plot as an interactive plotly widget
ggplotly(g_train)
## `geom_smooth()` using formula = 'y ~ x'
# Combine the training and test plots into a single figure; patchwork is
# loaded here because it supplies the `+` composition of two ggplot objects
library(patchwork)
g_train + g
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Final comparison of training and test metrics in one row
# (RMSE rounded to one decimal; adjusted R2 left unrounded)
tibble(
  RMSE_train = round(RMSE_train, 1),
  RMSE_test = round(RMSE, 1),
  Adjusted_R2_train = Adjusted_R2_train,
  Adjusted_R2_test = Adjusted_R2
)
## # A tibble: 1 × 4
##   RMSE_train RMSE_test Adjusted_R2_train Adjusted_R2_test
##        <dbl>     <dbl>             <dbl>            <dbl>
## 1        1.9       2.2             0.735            0.697